{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Web Scraping and Extraction with Firecrawl and Claude\n",
    "\n",
    "This notebook demonstrates how to use Firecrawl to scrape web content and Claude to extract structured data from it."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 1: Import Required Libraries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import os\n",
    "import json\n",
    "from firecrawl import FirecrawlApp\n",
    "from anthropic import Anthropic\n",
    "from dotenv import load_dotenv\n",
    "\n",
    "# Load environment variables\n",
    "load_dotenv()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 2: Set Up API Keys and URL"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "URL to scrape: https://mendable.ai\n"
     ]
    }
   ],
   "source": [
    "# Retrieve API keys from environment variables\n",
    "anthropic_api_key = os.getenv(\"ANTHROPIC_API_KEY\")\n",
    "firecrawl_api_key = os.getenv(\"FIRECRAWL_API_KEY\")\n",
    "\n",
    "# Set the URL to scrape\n",
    "url = \"https://mendable.ai\"  # Replace with the actual URL you want to scrape\n",
    "\n",
    "print(f\"URL to scrape: {url}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 3: Initialize Firecrawl and Anthropic Clients"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Firecrawl and Anthropic clients initialized.\n"
     ]
    }
   ],
   "source": [
    "# Initialize FirecrawlApp and Anthropic client\n",
    "firecrawl_app = FirecrawlApp(api_key=firecrawl_api_key)\n",
    "anthropic_client = Anthropic(api_key=anthropic_api_key)\n",
    "\n",
    "print(\"Firecrawl and Anthropic clients initialized.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 4: Scrape the URL using Firecrawl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Page content scraped. Length: 16199 characters\n"
     ]
    }
   ],
   "source": [
    "# Scrape the URL using Firecrawl\n",
    "page_content = firecrawl_app.scrape_url(url, params={\"pageOptions\": {\"onlyMainContent\": True}})\n",
    "\n",
    "print(f\"Page content scraped. Length: {len(page_content['content'])} characters\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 5: Prepare the Prompt for Claude"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Prompt prepared for Claude.\n"
     ]
    }
   ],
   "source": [
    "# Prepare the prompt for Claude\n",
    "prompt = f\"\"\"Analyze the following webpage content and extract the following information:\n",
    "1. The title of the page\n",
    "2. Whether the company is part of Y Combinator (YC)\n",
    "3. Whether the company/product is open source\n",
    "\n",
    "Return the information in JSON format with the following schema:\n",
    "{{\n",
    "    \"main_header_title\": string,\n",
    "    \"is_yc_company\": boolean,\n",
    "    \"is_open_source\": boolean\n",
    "}}\n",
    "\n",
    "Webpage content:\n",
    "{page_content['content']}\n",
    "\n",
    "Return only the JSON, nothing else.\"\"\"\n",
    "\n",
    "print(\"Prompt prepared for Claude.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 6: Query Claude"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Claude response received.\n"
     ]
    }
   ],
   "source": [
    "# Query Claude\n",
    "response = anthropic_client.messages.create(\n",
    "    model=\"claude-3-opus-20240229\",\n",
    "    max_tokens=1000,\n",
    "    messages=[\n",
    "        {\"role\": \"user\", \"content\": prompt}\n",
    "    ]\n",
    ")\n",
    "\n",
    "print(\"Claude response received.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 7: Parse and Display the Result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{\n",
      "  \"title\": \"Just in time answers for Sales and Support\",\n",
      "  \"is_yc_company\": true,\n",
      "  \"is_open_source\": false\n",
      "}\n"
     ]
    }
   ],
   "source": [
    "# Parse and print the result\n",
    "result = json.loads(response.content[0].text)\n",
    "print(json.dumps(result, indent=2))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
